/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

18 package org.apache.lucene.analysis.pattern;
19
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.regex.Pattern;
25
26 import org.apache.lucene.analysis.Analyzer;
27 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
28 import org.apache.lucene.analysis.CharFilter;
29 import org.apache.lucene.analysis.TokenStream;
30 import org.apache.lucene.analysis.Tokenizer;
31 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
32 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
33 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34
35 public class TestPatternTokenizer extends BaseTokenStreamTestCase
36 {
37 public void testSplitting() throws Exception
38 {
39 String qpattern = "\\'([^\\']+)\\'";
40 String[][] tests = {
41
42 { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
43 { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
44 { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
45 { "-1", ":", "boo:and:foo", "boo and foo" },
46 { "-1", "o", "boo:and:foo", "b :and:f" },
47 { "0", ":", "boo:and:foo", ": :" },
48 { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
49 { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
50 };
51
52 for( String[] test : tests ) {
53 TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
54 ((Tokenizer)stream).setReader(new StringReader(test[2]));
55 String out = tsToString( stream );
56
57
58 assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
59
60
61
62
63
64
65
66
67
68
69
70
71 }
72 }
73
74 public void testOffsetCorrection() throws Exception {
75 final String INPUT = "Günther Günther is here";
76
77
78 List<String> mappingRules = new ArrayList<>();
79 mappingRules.add( "\"ü\" => \"ü\"" );
80 NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
81 builder.add("ü", "ü");
82 NormalizeCharMap normMap = builder.build();
83 CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
84
85
86 Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
87 stream.setReader(charStream);
88 assertTokenStreamContents(stream,
89 new String[] { "Günther", "Günther", "is", "here" },
90 new int[] { 0, 13, 26, 29 },
91 new int[] { 12, 25, 28, 33 },
92 INPUT.length());
93
94 charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
95 stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
96 stream.setReader(charStream);
97 assertTokenStreamContents(stream,
98 new String[] { "Günther", "Günther" },
99 new int[] { 0, 13 },
100 new int[] { 12, 25 },
101 INPUT.length());
102 }
103
104
105
106
107 private static String tsToString(TokenStream in) throws IOException {
108 StringBuilder out = new StringBuilder();
109 CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
110
111
112 in.clearAttributes();
113 termAtt.setEmpty().append("bogusTerm");
114 in.reset();
115 while (in.incrementToken()) {
116 if (out.length() > 0)
117 out.append(' ');
118 out.append(termAtt.toString());
119 in.clearAttributes();
120 termAtt.setEmpty().append("bogusTerm");
121 }
122
123 in.close();
124 return out.toString();
125 }
126
127
128 public void testRandomStrings() throws Exception {
129 Analyzer a = new Analyzer() {
130 @Override
131 protected TokenStreamComponents createComponents(String fieldName) {
132 Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
133 return new TokenStreamComponents(tokenizer);
134 }
135 };
136 checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
137 a.close();
138
139 Analyzer b = new Analyzer() {
140 @Override
141 protected TokenStreamComponents createComponents(String fieldName) {
142 Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
143 return new TokenStreamComponents(tokenizer);
144 }
145 };
146 checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
147 b.close();
148 }
149
150
151 public void testHeapFreedAfterClose() throws Exception {
152
153
154
155 StringBuilder b = new StringBuilder();
156 for(int i=0;i<1024;i++) {
157
158 for(int j=0;j<1023;j++) {
159 b.append(' ');
160 }
161 b.append('x');
162 }
163
164 String big = b.toString();
165
166 Pattern x = Pattern.compile("x");
167
168 List<Tokenizer> tokenizers = new ArrayList<>();
169 for(int i=0;i<512;i++) {
170 Tokenizer stream = new PatternTokenizer(x, -1);
171 tokenizers.add(stream);
172 stream.setReader(new StringReader(big));
173 stream.reset();
174 for(int j=0;j<1024;j++) {
175 assertTrue(stream.incrementToken());
176 }
177 assertFalse(stream.incrementToken());
178 stream.end();
179 stream.close();
180 }
181 }
182 }